Data preparations

load("XSTSF_production.RData")
source('functions.R')

f0_all_ct <- f0_all_pre %>% filter(focus_condition == 'ct' ) %>% 
  group_by(speaker) %>% 
  mutate(norm_f0 = scale(log(f0))) %>% 
  ungroup()

f0_di_ct_lcmh <- f0_all_ct %>% 
  filter(syntax %in% c('L', 'M') & diortri == 'di') %>% 
  mutate(sandhi_tone = case_when(sandhi_tone == 'HLLM' ~ 'HMML',
                                 sandhi_tone == 'LLHL' ~ 'LLRF', 
                                 .default = sandhi_tone)) %>% 
  filter(!ind_no %in% c('S2_1_ct', 'S2_11_ct', 'S2_27_ct', 'S3_5_ct', 'S3_19_ct', 'S5_27_ct',
                        'S2_44_ct', 'S3_37_ct', 'S3_44_ct', 'S6_16_ct', 'S6_31_ct', 'S6_39_ct', 'S7_33_ct')) %>% 
  filter(is.na(sandhi_tone) == FALSE) 
  
f0_di_ct_lcmh_h <- f0_di_ct_lcmh %>% filter( grepl('^H', sync_tone1))
f0_di_ct_lcmh_l <- f0_di_ct_lcmh %>% 
  filter( grepl('^[LR]', sync_tone1)) 
 # mutate(sandhi_tone = ifelse(sandhi_tone == 'LLHH' & hist_tone1 == 'yangping', 
                              #'LLLM', sandhi_tone))

f0_di_ct_lc_h <- f0_di_ct_lcmh_h %>% filter(syntax == 'L')
f0_di_ct_mh_h <- f0_di_ct_lcmh_h %>% filter(syntax == 'M')
f0_di_ct_lc_l <- f0_di_ct_lcmh_l %>% filter(syntax == 'L')
f0_di_ct_mh_l <- f0_di_ct_lcmh_l %>% filter(syntax == 'M')

f0_di_ct_lcmh_hp <- f0_di_ct_lcmh_h %>% filter(hist_tone1 == 'yinping')
f0_di_ct_lcmh_hs <- f0_di_ct_lcmh_h %>% filter(hist_tone1 == 'yinshang')
f0_di_ct_lcmh_lp <- f0_di_ct_lcmh_l %>% filter(hist_tone1 == 'yangping')
f0_di_ct_lcmh_ls <- f0_di_ct_lcmh_l %>% filter(hist_tone1 == 'yangshang')

Initial data inspection

# yinping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hp, 'speaker'), tooltip = c('text', 'x'))
# yinshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hs, 'speaker'), tooltip = c('text', 'x'))
# yangping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_lp, 'speaker'), tooltip = c('text', 'x'))
# yangshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_ls, 'speaker'), tooltip = c('text', 'x'))

Perceptual categorisation

unique(f0_di_ct_lcmh_l$sandhi_tone) # check the labels
## [1] "LLHH" "LLRF" "LMML" "LLLM"
p_cluster(f0_di_ct_lcmh_l, sandhi_tone)

p_cluster(f0_di_ct_lcmh_lp, sandhi_tone)

p_cluster(f0_di_ct_lcmh_ls, sandhi_tone)

Distribution analysis

# monosyllabic tone (initial tone)
distri_count(f0_di_ct_lcmh_l, speaker, sync_tone1)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

distri_count(f0_di_ct_lcmh_l, hist_tone1, sync_tone1)

# by second tone
distri_count(f0_di_ct_lcmh_l, sync_tone2, sandhi_tone)

distri_count(f0_di_ct_lcmh_l, sync_tone2, sandhi_tone, hist_tone1)

distri_count(f0_di_ct_lc_l, sync_tone2, sandhi_tone)+ggtitle('Lexical compounds')

distri_count(f0_di_ct_mh_l, sync_tone2, sandhi_tone)+ggtitle('Adjective-Noun phrases')

distri_count(f0_di_ct_lcmh_l, hist_tone2, sandhi_tone)

# by first tone
distri_count(f0_di_ct_lcmh_l, sync_tone1, sandhi_tone)

distri_count(f0_di_ct_lcmh_l, hist_tone1, sandhi_tone)

distri_count(f0_di_ct_lcmh_l, hist_tone1, sandhi_tone, syntax)

# by speaker
distri_count(f0_di_ct_lcmh_l, speaker, sandhi_tone, hist_tone1)

# by item
distri_count(f0_di_ct_lcmh_lp, citation_no, sandhi_tone)

distri_count(f0_di_ct_lcmh_ls, citation_no, sandhi_tone)

distri_prop(f0_di_ct_lcmh_l, hist_tone1, sync_tone1)

distri_prop(f0_di_ct_lcmh_l, sync_tone2, hist_tone2)

distri_prop(f0_di_ct_lcmh_l, syntax, sandhi_tone)

distri_prop(f0_di_ct_lcmh_l, hist_tone1, sandhi_tone)

distri_prop(f0_di_ct_lcmh_l, sync_tone2, sandhi_tone, hist_tone1)

distri_prop(f0_di_ct_lcmh_l, hist_tone2, sandhi_tone, hist_tone1)

distri_prop(f0_di_ct_lcmh_l, speaker, sandhi_tone, hist_tone1)

distri_prop(f0_di_ct_lcmh_ls, speaker, sandhi_tone, sync_tone2)

distri_prop(f0_di_ct_lcmh_l, syntax, sandhi_tone, hist_tone1)

# Gradience

f0_di_ct_lcmh_l_llhh <- filter(f0_di_ct_lcmh_l, sandhi_tone %in% c('LLHH', 'LLLM')) %>% 
  mutate(sandhi_tone = paste0(sandhi_tone, '_', hist_tone1))
p_sub_cluster(f0_di_ct_lcmh_l_llhh, sandhi_tone)
## `summarise()` has grouped output by 'sandhi_tone', 'syllable_no'. You can
## override using the `.groups` argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Group-by different factors

# second tone [synchronic] & syntax
p_sub_cluster(f0_di_ct_lcmh_l, sync_tone2, sandhi_tone)

# LLLM
f0_di_ct_lp_lllm <- f0_di_ct_lcmh_lp %>% filter(sandhi_tone == 'LLLM')
p_cluster(f0_di_ct_lp_lllm, sync_tone2, 'sandhi_tone')

p_sub_cluster(f0_di_ct_lp_lllm, sync_tone2, sandhi_tone)

# LLLM: LC
f0_di_ct_lc_lp_lllm <- f0_di_ct_lcmh_lp %>% filter(sandhi_tone == 'LLLM' & syntax == 'L')
p_sub_cluster(f0_di_ct_lc_lp_lllm, sync_tone2, sandhi_tone)

# LLLM: AN
f0_di_ct_mh_lp_lllm <- f0_di_ct_lcmh_lp %>% filter(sandhi_tone == 'LLLM' & syntax == 'M')
p_sub_cluster(f0_di_ct_mh_lp_lllm, sync_tone2, sandhi_tone)

# yangping & yangshang 
p_sub_cluster(f0_di_ct_lcmh_lp, sync_tone2, sandhi_tone)

p_sub_cluster(f0_di_ct_lp_lllm, sync_tone2)

p_sub_cluster(f0_di_ct_lp_lllm, sync_tone2, syntax)

p_sub_cluster(f0_di_ct_lcmh_ls, sync_tone2, sandhi_tone)

f0_di_ct_lp_lllm %>% filter(sync_tone2 == 'HL')
## # A tibble: 20 × 20
##    speaker token diortri focus_condition focus_no citation_tone sandhi_tone
##    <fct>   <chr> <chr>   <chr>              <dbl> <chr>         <chr>      
##  1 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  2 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  3 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  4 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  5 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  6 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  7 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  8 S4      皮袄  di      ct                    NA RF_HL         LLLM       
##  9 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 10 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 11 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 12 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 13 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 14 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 15 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 16 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 17 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 18 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 19 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## 20 S4      皮袄  di      ct                    NA RF_HL         LLLM       
## # ℹ 13 more variables: syllable_no <dbl>, citation_no <fct>, ind_no <chr>,
## #   sync_tone1 <chr>, sync_tone2 <chr>, sync_tone3 <chr>, hist_tone1 <chr>,
## #   hist_tone2 <chr>, hist_tone3 <chr>, syntax <chr>, time <fct>, f0 <dbl>,
## #   norm_f0 <dbl[,1]>

k-means clustering

# data preparation
f0_di_ct_lcmh_l_kmeans <- f0_di_ct_lcmh_l %>% 
  select(-diortri, -syllable_no, -focus_no, -f0) %>% 
  spread(time, norm_f0)

# k-means clustering
cluster_model <- k_means_clustering(f0_di_ct_lcmh_l_kmeans)
kml(cluster_model, nbClusters = 2:10) 
##  ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
kml::plot(cluster_model, 3, parTraj=parTRAJ(col="clusters"))

kml::plot(cluster_model, 4, parTraj=parTRAJ(col="clusters"))

plotAllCriterion(cluster_model)

# get cluster results
f0_di_ct_lcmh_l_kmeans <- f0_di_ct_lcmh_l_kmeans %>% 
  mutate(cluster4 = getClusters(cluster_model, 4),
         cluster3 = getClusters(cluster_model, 3),
         sub_cluster = paste0(sandhi_tone, '_', cluster4)) %>% 
  mutate(sandhi_tone = ifelse(cluster4 == 'C' & sandhi_tone == 'LLLM', 'LLRF', sandhi_tone))

# heatmap distribution
cluster_solution <- wide_to_long(f0_di_ct_lcmh_l_kmeans) 
heatmap_df <- heatmap_data(cluster_solution, cluster3)
compare_cluster(heatmap_df, 'cluster3')

cluster_solution <- wide_to_long(f0_di_ct_lcmh_l_kmeans) 
heatmap_df <- heatmap_data(cluster_solution, cluster4)
compare_cluster(heatmap_df, 'cluster4')

cluster_solution %>% filter(cluster4 == 'A' & sandhi_tone == 'LLLM' & time == 1)
## # A tibble: 15 × 20
##    speaker token focus_condition citation_tone sandhi_tone citation_no ind_no  
##    <fct>   <chr> <chr>           <chr>         <chr>       <fct>       <chr>   
##  1 S1      黄花  ct              RF_HH         LLLM        48          S1_48_ct
##  2 S2      杨梅  ct              RF_RF         LLLM        35          S2_35_ct
##  3 S2      黄树  ct              RF_LH         LLLM        52          S2_52_ct
##  4 S3      杨梅  ct              RF_LH         LLLM        35          S3_35_ct
##  5 S5      杨梅  ct              RF_RF         LLLM        35          S5_35_ct
##  6 S5      皮鞋  ct              RF_RF         LLLM        39          S5_39_ct
##  7 S5      黄树  ct              RF_LH         LLLM        52          S5_52_ct
##  8 S5      黄绳  ct              RF_RF         LLLM        28          S5_28_ct
##  9 S5      黄豆  ct              RF_LH         LLLM        41          S5_41_ct
## 10 S7      黄树  ct              RF_LH         LLLM        52          S7_52_ct
## 11 S7      黄绳  ct              RF_RF         LLLM        28          S7_28_ct
## 12 S7      黄豆  ct              RF_LH         LLLM        41          S7_41_ct
## 13 S8      杨梅  ct              RF_RF         LLLM        35          S8_35_ct
## 14 S8      黄绳  ct              RF_RF         LLLM        28          S8_28_ct
## 15 S8      黄豆  ct              RF_LH         LLLM        41          S8_41_ct
## # ℹ 13 more variables: sync_tone1 <chr>, sync_tone2 <chr>, sync_tone3 <chr>,
## #   hist_tone1 <chr>, hist_tone2 <chr>, hist_tone3 <chr>, syntax <chr>,
## #   cluster4 <fct>, cluster3 <fct>, sub_cluster <chr>, time <int>,
## #   norm_f0 <dbl>, syllable_no <chr>

Examine mismatches

cluster_lllm <- cluster_solution %>% filter(sandhi_tone == 'LLLM')
p_cluster(cluster_lllm, sub_cluster)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.